This notebook walks through the creation of multitask models on MUV. The goal is to demonstrate that multitask methods outperform singletask methods on MUV.


In [1]:
# Auto-reload edited Python modules so local deepchem changes take effect
# without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# Disable automatic post-mortem debugging on uncaught exceptions.
%pdb off
# When True, the featurization/splitting steps below reuse any results
# already written to disk instead of recomputing them.
reload = True


Automatic pdb calling has been turned OFF

In [2]:
from deepchem.utils.save import load_from_disk
from deepchem.datasets import Dataset

# Location of the gzipped MUV csv on disk (path reused by the
# featurization step further down).
dataset_file = "../datasets/muv.csv.gz"
dataset = load_from_disk(dataset_file)

# Quick sanity check: report the column layout and the number of rows.
print("Columns of dataset: %s" % str(dataset.columns.values))
print("Number of examples in dataset: %s" % str(dataset.shape[0]))


Columns of dataset: ['mol_id' 'smiles' 'MUV-692' 'MUV-689' 'MUV-846' 'MUV-859' 'MUV-644'
 'MUV-548' 'MUV-852' 'MUV-600' 'MUV-810' 'MUV-712' 'MUV-737' 'MUV-858'
 'MUV-713' 'MUV-733' 'MUV-652' 'MUV-466' 'MUV-832']
Number of examples in dataset: 93127

Now, let's visualize some compounds from our dataset.


In [3]:
from itertools import islice
from rdkit import Chem
from deepchem.utils.visualization import mols_to_pngs
from deepchem.utils.visualization import display_images

# Render the first dozen compounds as 2D structure images.
num_to_display = 12
molecules = [Chem.MolFromSmiles(row["smiles"])
             for _, row in islice(dataset.iterrows(), num_to_display)]
display_images(mols_to_pngs(molecules))



In [4]:
from deepchem.featurizers.fingerprints import CircularFingerprint

# Represent each compound as a 1024-bit circular (ECFP-style) fingerprint.
featurizers = [CircularFingerprint(size=1024)]

In [5]:
# The 17 MUV assay targets, built from their numeric assay ids.
MUV_tasks = ['MUV-%d' % assay_id for assay_id in
             (692, 689, 846, 859, 644, 548, 852, 600, 810,
              712, 737, 858, 713, 733, 652, 466, 832)]

In [6]:
import os
from deepchem.featurizers.featurize import DataFeaturizer

# All intermediate artifacts produced by this analysis live under base_dir.
# NOTE(review): hardcoded scratch path — adjust for your own machine.
base_dir = "/scratch/users/rbharath/muv_multitask_analysis"
# Subdirectories for featurized representations and sample metadata.
feature_dir = os.path.join(base_dir, "features")
samples_dir = os.path.join(base_dir, "samples")

# Featurize every compound for all 17 MUV tasks using the circular
# fingerprints configured above.
featurizer = DataFeaturizer(tasks=MUV_tasks,
                            smiles_field="smiles",
                            compound_featurizers=featurizers,
                            verbosity="low")

# reload=True reuses an existing featurization on disk rather than recomputing.
featurized_samples = featurizer.featurize(
    dataset_file, feature_dir, samples_dir, shard_size=4096, reload=reload)

In [7]:
# Scaffold-based split: compounds sharing a scaffold end up in the same
# partition, making the valid/test sets structurally distinct from train.
splittype = "scaffold"
train_dir, valid_dir, test_dir = (
    os.path.join(base_dir, subdir)
    for subdir in ("train_dataset", "valid_dataset", "test_dataset"))

train_samples, valid_samples, test_samples = featurized_samples.train_valid_test_split(
    splittype, train_dir, valid_dir, test_dir, log_every_n=1000, reload=reload)

In [8]:
from deepchem.datasets import Dataset

verbosity = None

def _build_dataset(split_name, data_dir, samples):
    """Construct (or reload from disk) the featurized Dataset for one split.

    The original cell repeated this construction verbatim for the train,
    valid, and test splits; factoring it out keeps the three calls in sync.
    """
    print("Creating %s dataset" % split_name)
    return Dataset(data_dir=data_dir, samples=samples,
                   featurizers=featurizers, tasks=MUV_tasks,
                   verbosity=verbosity, reload=reload)

train_dataset = _build_dataset("train", train_dir, train_samples)
valid_dataset = _build_dataset("valid", valid_dir, valid_samples)
test_dataset = _build_dataset("test", test_dir, test_samples)


Creating train dataset
Creating valid dataset
Creating test dataset

In [9]:
# No normalization/transformation is applied: inputs are binary
# fingerprints and outputs are binary labels.
input_transformers, output_transformers = [], []

In [10]:
from deepchem.hyperparameters import HyperparamOpt
from deepchem.models.tensorflow_models import TensorflowModel
from deepchem.models.tensorflow_models.fcnet import TensorflowMultiTaskClassifier
from deepchem import metrics
from deepchem.metrics import Metric
import numpy as np
import numpy.random
model_dir = os.path.join(base_dir, "model")

# Every MUV task is treated as a binary classification problem.
MUV_task_types = {task: "Classification" for task in MUV_tasks}
# Each key maps to a one-element list, so the hyperparameter "search"
# below trains exactly one model configuration (1 hidden layer of 1000
# units, SGD, 1 epoch).
params_dict = {"activation": ["relu"],
               "momentum": [.9],
               "batch_size": [50],
               "init": ["glorot_uniform"],
               "data_shape": [train_dataset.get_data_shape()],
               "learning_rate": [1e-3],
               "decay": [1e-6],
               "nb_hidden": [1000], 
               "nb_epoch": [1],
               "nesterov": [False],
               "dropouts": [(.5,)],
               "nb_layers": [1],
               "batchnorm": [False],
               "layer_sizes": [(1000,)],
               "weight_init_stddevs": [(.1,)],
               "bias_init_consts": [(1.,)],
               "num_classes": [2],
               "penalty": [0.], 
               "optimizer": ["sgd"],
               "num_classification_tasks": [len(MUV_task_types)]
              } 

def model_builder(task_types, params_dict, logdir, verbosity=None):
    """Build a TensorFlow multitask fully-connected classifier for the tasks."""
    return TensorflowModel(
        task_types, params_dict, logdir, 
        tf_class=TensorflowMultiTaskClassifier,
        verbosity=verbosity)

# Model selection metric: ROC-AUC averaged (np.mean) across the 17 tasks.
metric = Metric(metrics.roc_auc_score, np.mean)
optimizer = HyperparamOpt(model_builder, MUV_task_types, verbosity="low")
# NOTE(review): the captured run below failed — roc_auc_score raised
# "unknown format is not supported" and the validation score came back nan
# ("No models trained correctly"). Presumably the predicted-label format
# passed to the metric is wrong — TODO investigate before trusting results.
best_dnn, best_hyperparams, all_results = optimizer.hyperparam_search(
    params_dict, train_dataset, valid_dataset, output_transformers, metric, logdir=model_dir)


Training for 1 epochs
Ending epoch 0: loss 0.00714142
ys[0]
[0.0 0.0 0.0 ..., 0.0 0.0 0.0]
y_preds[0]
[0 0 0 ..., 0 0 0]
Saving predictions to <open file '<fdopen>', mode 'w+b' at 0x7f280c626c00>
/home/rbharath/deepchem/deepchem/metrics/__init__.py:151: UserWarning: Error calculating metric mean-roc_auc_score: unknown format is not supported
  % (self.name, e))
Saving model performance scores to <open file '<fdopen>', mode 'w+b' at 0x7f280c626b70>
hyperparameters.compute
valid_score
nan
hyperparameter_tuple
('sgd', 1e-06, (1024,), 1, 'relu', (1000,), 50, 0.0, False, 'glorot_uniform', (1.0,), (0.1,), 2, 1, 17, 1000, False, (0.5,), 0.001, 0.9)
Model 0/1, Metric mean-roc_auc_score, Validation set 0: nan
	best_validation_score so far: -inf
No models trained correctly.

In [ ]: